Analysis date: 2023-01-02

Depends on

CRC_Xenografts_FirstBatch_DataProcessing Script

load("../Data/Cache/Xenografts_Batch1_DataProcessing.RData")

TODO

Setup

Load libraries and functions

Functions

PLSDA

Tune_And_Perform_PLSDA <- function(X, Y, folds, repeats, scale = TRUE, upperlimitfeatures = 200, initial_nr_of_components = 10, progressBar = TRUE){
  plsda <- splsda( X = X, 
         Y= Y, scale = scale, ncomp = initial_nr_of_components)

    message("Done calculating initial PLSDA")
  message("Tuning components:")
  perf.plsda <- perf(plsda, validation = "Mfold", 
                            folds = folds, nrepeat = repeats, # use repeated cross-validation
                            progressBar = progressBar, auc = TRUE) # include AUC values
  plot(perf.plsda, col = color.mixo(5:7), 
       sd = TRUE,
       legend.position = "horizontal")
  #perf.plsda$choice.ncomp
  message("Done tuning components")
  
  # grid of possible keepX values that will be tested for each component
  list.keepX <- c(1:20,  seq(20, upperlimitfeatures, 10))
  
  message("Tune number of selected features:")
  # undergo the tuning process to determine the optimal number of variables
  tune.plsda <- tune.splsda(X = X, Y= Y, scale = scale,
         ncomp = perf.plsda$choice.ncomp[2,1], validation = 'Mfold',
         folds = folds, nrepeat = repeats, # use repeated cross-validation
         dist = 'max.dist', # use max.dist measure
         measure = "BER", # use balanced error rate of dist measure
         test.keepX = list.keepX,
         progressBar = progressBar,
         cpus = 2) 
  p_tune_feat <- plot(tune.plsda, col = color.jet(perf.plsda$choice.ncomp[2,1]))
  print(p_tune_feat)
  message(paste("Optimal number of components:", tune.plsda$choice.ncomp$ncomp ) )
  message("Optimal number of selected features per component:")
  print(tune.plsda$choice.keepX)
  optimal.ncomp <- tune.plsda$choice.ncomp$ncomp
  optimal.keepX <- tune.plsda$choice.keepX[1:optimal.ncomp]
  
  message("Calculate final sPLSDA model")
  final.splsda <- splsda(X, Y, 
                         ncomp = optimal.ncomp, 
                         keepX = optimal.keepX, scale = scale)
#  perf.final.splsda <- perf(final.splsda, 
#                          folds = folds, nrepeat = repeats, # use #repeated cross-validation
#                          validation = "Mfold", dist = "max.dist",  # #use max.dist measure
#                          progressBar = FALSE, auc = TRUE)
#  perf.final.splsda$auc
  return(final.splsda)
}

GSEA

Run_GSEA_PLSDA <- function(PLSDA_result, component){
  ranked_list <- 
    PLSDA_result$loadings$X %>% 
    as.data.frame() %>%
    rownames_to_column("peptide") %>%
    as_tibble() %>%
    separate(peptide, into = c("HGNC_Symbol", "Annotated_Sequence")) %>%
    select(HGNC_Symbol, FC = {{ component }} ) %>% 
    mutate(absFC = abs(FC) ) %>%
    group_by(HGNC_Symbol) %>%
    arrange(desc(absFC) ) %>%
    slice(1) %>%
    ungroup %>%
    arrange(FC)
  
  ranked_vector <- unlist(ranked_list[,2])
  names(ranked_vector) <- mapIds(org.Hs.eg.db, 
                                 keys = ranked_list$HGNC_Symbol,
                                 column = "ENTREZID", keytype = "SYMBOL")
  
  ranked_vector <- ranked_vector[!is.na(names(ranked_vector)) ]
  
  pathways <- reactomePathways(names(ranked_vector))
  fgseaRes <- fgsea(pathways, ranked_vector, maxSize=500)
  
  #print(head(fgseaRes))
  
  topPathwaysUp <- fgseaRes[ES > 0 & padj < 0.1][head(order(pval), n=10), pathway]
  topPathwaysDown <- fgseaRes[ES < 0 & padj < 0.1][head(order(pval), n=10), pathway]
  topPathways <- c(topPathwaysUp, rev(topPathwaysDown))
  message(paste( length(topPathways), "pathways below FDR of 10%"))
  if(length(topPathways) > 0 ){
    plot.new()
    ggplot() + theme_void()
    plotGseaTable(pathways[topPathways], ranked_vector, fgseaRes, 
                  gseaParam=0.5)
  }
}

Analyses

Partial least squares discriminat analysis

pY all samples

Perform PLSDA
plsda_pY <- Tune_And_Perform_PLSDA(
  X = pY_mat_normtomedian,
  Y = as.factor( str_split( gsub("log2FC_Xenograft_", "", rownames( pY_mat_normtomedian ) ), 
         pattern = "_", simplify = TRUE )[,1] ),
  repeats = 50, folds = 3, initial_nr_of_components = 4,
  scale = TRUE, progressBar = FALSE)
## Done calculating initial PLSDA
## Tuning components:
## Done tuning components
## Tune number of selected features:

## Optimal number of components: 3
## Optimal number of selected features per component:
## comp1 comp2 comp3 
##    90     4    50
## Calculate final sPLSDA model

Plot Components
plotIndiv(plsda_pY, comp = c(1,2), # plot samples from final model
          col.per.group = PGPalette[c(1,2,4,5)],
          ellipse = TRUE, legend = TRUE, # include 95% confidence ellipse
          title = ' (a) sPLS-DA on pY, comp 1 & 2')

plsda_pY$prop_expl_var
## $X
##      comp1      comp2      comp3 
## 0.36123832 0.07094834 0.20081173 
## 
## $Y
##     comp1     comp2     comp3 
## 0.3352754 0.3329572 0.2885741
Plot Loadings
plotLoadings(plsda_pY, method = 'mean', contrib = 'max', comp = 1, size.name = 0.5, size.title = 1, legend.color = PGPalette[c(1,2,4,5)])    

plotLoadings(plsda_pY, method = 'mean', contrib = 'max', comp = 2, size.name = 0.5, size.title = 1, legend.color = PGPalette[c(1,2,4,5)])   

#plotLoadings(plsda_pY, method = 'mean', contrib = 'max', comp = 3, size.name = 0.5, size.title = 1, legend.color = PGPalette[c(1,2,4,5)])  
Correlation Circle Plot
plotVar(plsda_pY, cex =2, comp = c(1,2), var.names =  list(stringr::str_split(plsda_pY$names$colnames$X, pattern = "_", simplify = T )[,1] ) )

#plotVar(plsda_pY, cex =2, comp = c(2,3), var.names =  list(stringr::str_split(plsda_pY$names$colnames$X, pattern = "_", simplify = T )[,1] ) )
#selectVar(plsda_pY, comp = 1)$name 
GSEA
Run_GSEA_PLSDA(PLSDA_result = plsda_pY, component = comp1)
## 'select()' returned 1:1 mapping between keys and columns
## Loading required namespace: reactome.db
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 0 pathways below FDR of 10%
Run_GSEA_PLSDA(PLSDA_result = plsda_pY, component = comp2)
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## Warning in fgseaMultilevel(...): For some of the pathways the P-values were
## likely overestimated. For such pathways log2err is set to NA.
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-50. You can set the `eps` argument to zero for better estimation.
## 10 pathways below FDR of 10%

StringDB
Plot_StringDB(
  rownames(plsda_pY$loadings$X[ (plsda_pY$loadings$X[,1] != 0 ),] ) %>% 
  str_split(pattern = "_", simplify = T) %>% .[,1])

Plot_StringDB(
  rownames(plsda_pY$loadings$X[ (plsda_pY$loadings$X[,2] != 0 ),] ) %>% 
  str_split(pattern = "_", simplify = T) %>% .[,1])

pY prep 1

Perform PLSDA
plsda_pY_prep1 <- Tune_And_Perform_PLSDA(
  X = pY_mat_normtomedian[paste0("log2FC_", names(prep_l[prep_l == "prep1"]) ),],
  Y = as.factor( str_split( gsub("log2FC_Xenograft_", "", rownames( pY_mat_normtomedian[paste0("log2FC_", names(prep_l[prep_l == "prep1"]) ),] ) ), 
         pattern = "_", simplify = TRUE )[,1] ),
  repeats = 50, folds = 3, initial_nr_of_components = 4,
  scale = TRUE, progressBar = FALSE)
## Done calculating initial PLSDA
## Tuning components:
## Done tuning components
## Tune number of selected features:

## Optimal number of components: 2
## Optimal number of selected features per component:
## comp1 comp2 
##    18    30
## Calculate final sPLSDA model

Plot Components
plotIndiv(plsda_pY_prep1, comp = c(1,2), # plot samples from final model
          col.per.group = PGPalette[c(1,2,4,5)],
          ellipse = TRUE, legend = TRUE, # include 95% confidence ellipse
          title = ' (a) sPLS-DA on pY, comp 1 & 2')

plsda_pY_prep1$prop_expl_var
## $X
##      comp1      comp2 
## 0.35072718 0.09556697 
## 
## $Y
##     comp1     comp2 
## 0.3336348 0.3421679
Plot Loadings
plotLoadings(plsda_pY_prep1, method = 'mean', contrib = 'max', comp = 1, size.name = 0.5, size.title = 1, legend.color = PGPalette[c(1,2,4,5)])    

plotLoadings(plsda_pY_prep1, method = 'mean', contrib = 'max', comp = 2, size.name = 0.5, size.title = 1, legend.color = PGPalette[c(1,2,4,5)])   

#plotLoadings(plsda_pY_prep1, method = 'mean', contrib = 'max', comp = 3, size.name = 0.5, size.title = 1, legend.color = PGPalette[c(1,2,4,5)])  
Correlation Circle Plot
plotVar(plsda_pY_prep1, cex =2, comp = c(1,2), var.names =  list(stringr::str_split(plsda_pY_prep1$names$colnames$X, pattern = "_", simplify = T )[,1] ) )

#plotVar(plsda_pY_prep1, cex =2, comp = c(2,3), var.names =  list(stringr::str_split(plsda_pY_prep1$names$colnames$X, pattern = "_", simplify = T )[,1] ) )
#selectVar(plsda_pY_prep1, comp = 1)$name 
GSEA
Run_GSEA_PLSDA(PLSDA_result = plsda_pY_prep1, component = comp1)
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## Warning in fgseaMultilevel(...): For some of the pathways the P-values were
## likely overestimated. For such pathways log2err is set to NA.
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-50. You can set the `eps` argument to zero for better estimation.
## 2 pathways below FDR of 10%

Run_GSEA_PLSDA(PLSDA_result = plsda_pY_prep1, component = comp2)
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 0 pathways below FDR of 10%
StringDB
Plot_StringDB(
  rownames(plsda_pY_prep1$loadings$X[ (plsda_pY_prep1$loadings$X[,1] != 0 ),] ) %>% 
  str_split(pattern = "_", simplify = T) %>% .[,1])

Plot_StringDB(
  rownames(plsda_pY_prep1$loadings$X[ (plsda_pY_prep1$loadings$X[,2] != 0 ),] ) %>% 
  str_split(pattern = "_", simplify = T) %>% .[,1])

pY prep 2

Perform PLSDA
plsda_pY_prep2 <- Tune_And_Perform_PLSDA(
  X = pY_mat_normtomedian[paste0("log2FC_", names(prep_l[prep_l == "prep2"]) ),],
  Y = as.factor( str_split( gsub("log2FC_Xenograft_", "", rownames( pY_mat_normtomedian[paste0("log2FC_", names(prep_l[prep_l == "prep2"]) ),] ) ), 
         pattern = "_", simplify = TRUE )[,1] ),
  repeats = 50, folds = 3, initial_nr_of_components = 4,
  scale = TRUE, progressBar = FALSE)
## Done calculating initial PLSDA
## Tuning components:
## Done tuning components
## Tune number of selected features:

## Optimal number of components: 4
## Optimal number of selected features per component:
## comp1 comp2 comp3 comp4 
##   110    11     8     3
## Calculate final sPLSDA model

Plot Components
plotIndiv(plsda_pY_prep2, comp = c(1,2), # plot samples from final model
          col.per.group = PGPalette[c(1,2,4,5)],
          ellipse = TRUE, legend = TRUE, # include 95% confidence ellipse
          title = ' (a) sPLS-DA on pY, comp 1 & 2')

plsda_pY_prep2$prop_expl_var
## $X
##      comp1      comp2      comp3      comp4 
## 0.35127891 0.11397880 0.04623242 0.04673407 
## 
## $Y
##     comp1     comp2     comp3     comp4 
## 0.3300180 0.2795119 0.3366914 0.2901813
Plot Loadings
plotLoadings(plsda_pY_prep2, method = 'mean', contrib = 'max', comp = 1, size.name = 0.5, size.title = 1, legend.color = PGPalette[c(1,2,4,5)])    

plotLoadings(plsda_pY_prep2, method = 'mean', contrib = 'max', comp = 2, size.name = 0.5, size.title = 1, legend.color = PGPalette[c(1,2,4,5)])   

#plotLoadings(plsda_pY_prep2, method = 'mean', contrib = 'max', comp = 3, size.name = 0.5, size.title = 1, legend.color = PGPalette[c(1,2,4,5)])  
Correlation Circle Plot
plotVar(plsda_pY_prep2, cex =2, comp = c(1,2), var.names =  list(stringr::str_split(plsda_pY_prep2$names$colnames$X, pattern = "_", simplify = T )[,1] ) )

#plotVar(plsda_pY_prep2, cex =2, comp = c(2,3), var.names =  list(stringr::str_split(plsda_pY_prep2$names$colnames$X, pattern = "_", simplify = T )[,1] ) )
#selectVar(plsda_pY_prep2, comp = 1)$name 
GSEA
Run_GSEA_PLSDA(PLSDA_result = plsda_pY_prep2, component = comp1)
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 0 pathways below FDR of 10%
Run_GSEA_PLSDA(PLSDA_result = plsda_pY_prep2, component = comp2)
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 0 pathways below FDR of 10%
StringDB
Plot_StringDB(
  rownames(plsda_pY_prep2$loadings$X[ (plsda_pY_prep2$loadings$X[,1] != 0 ),] ) %>% 
  str_split(pattern = "_", simplify = T) %>% .[,1])

Plot_StringDB(
  rownames(plsda_pY_prep2$loadings$X[ (plsda_pY_prep2$loadings$X[,2] != 0 ),] ) %>% 
  str_split(pattern = "_", simplify = T) %>% .[,1])

Session Info

sessionInfo()
## R version 4.1.3 (2022-03-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] forcats_0.5.2               stringr_1.4.1              
##  [3] dplyr_1.0.10                purrr_0.3.5                
##  [5] readr_2.1.3                 tidyr_1.2.1                
##  [7] tibble_3.1.8                tidyverse_1.3.2            
##  [9] mixOmics_6.18.1             ggplot2_3.3.6              
## [11] lattice_0.20-45             MASS_7.3-58.1              
## [13] mdatools_0.13.0             SummarizedExperiment_1.24.0
## [15] GenomicRanges_1.46.1        GenomeInfoDb_1.30.1        
## [17] MatrixGenerics_1.6.0        matrixStats_0.62.0         
## [19] DEP_1.16.0                  org.Hs.eg.db_3.14.0        
## [21] AnnotationDbi_1.56.2        IRanges_2.28.0             
## [23] S4Vectors_0.32.4            Biobase_2.54.0             
## [25] BiocGenerics_0.40.0         fgsea_1.20.0               
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.2             shinydashboard_0.7.2   proto_1.0.0           
##   [4] gmm_1.7                tidyselect_1.2.0       RSQLite_2.2.18        
##   [7] htmlwidgets_1.5.4      grid_4.1.3             BiocParallel_1.28.3   
##  [10] norm_1.0-10.0          munsell_0.5.0          codetools_0.2-18      
##  [13] preprocessCore_1.56.0  chron_2.3-58           DT_0.26               
##  [16] withr_2.5.0            colorspace_2.0-3       highr_0.9             
##  [19] knitr_1.40             rstudioapi_0.14        mzID_1.32.0           
##  [22] labeling_0.4.2         GenomeInfoDbData_1.2.7 bit64_4.0.5           
##  [25] farver_2.1.1           vctrs_0.5.0            generics_0.1.3        
##  [28] xfun_0.34              R6_2.5.1               doParallel_1.0.17     
##  [31] clue_0.3-62            MsCoreUtils_1.6.2      bitops_1.0-7          
##  [34] cachem_1.0.6           DelayedArray_0.20.0    assertthat_0.2.1      
##  [37] promises_1.2.0.1       scales_1.2.1           googlesheets4_1.0.1   
##  [40] gtable_0.3.1           affy_1.72.0            sandwich_3.0-2        
##  [43] rlang_1.0.6            mzR_2.28.0             GlobalOptions_0.1.2   
##  [46] gargle_1.2.1           impute_1.68.0          broom_1.0.1           
##  [49] BiocManager_1.30.19    yaml_2.3.6             reshape2_1.4.4        
##  [52] modelr_0.1.9           backports_1.4.1        httpuv_1.6.6          
##  [55] tools_4.1.3            affyio_1.64.0          ellipsis_0.3.2        
##  [58] gplots_3.1.3           jquerylib_0.1.4        RColorBrewer_1.1-3    
##  [61] STRINGdb_2.6.5         MSnbase_2.20.4         gsubfn_0.7            
##  [64] Rcpp_1.0.9             hash_2.2.6.2           plyr_1.8.7            
##  [67] zlibbioc_1.40.0        RCurl_1.98-1.9         sqldf_0.4-11          
##  [70] GetoptLong_1.0.5       zoo_1.8-11             haven_2.5.1           
##  [73] ggrepel_0.9.1          cluster_2.1.4          fs_1.5.2              
##  [76] magrittr_2.0.3         data.table_1.14.4      RSpectra_0.16-1       
##  [79] circlize_0.4.15        reactome.db_1.77.0     reprex_2.0.2          
##  [82] googledrive_2.0.0      pcaMethods_1.86.0      mvtnorm_1.1-3         
##  [85] ProtGenerics_1.26.0    hms_1.1.2              mime_0.12             
##  [88] evaluate_0.17          xtable_1.8-4           XML_3.99-0.12         
##  [91] readxl_1.4.1           gridExtra_2.3          shape_1.4.6           
##  [94] compiler_4.1.3         ellipse_0.4.3          KernSmooth_2.23-20    
##  [97] ncdf4_1.19             crayon_1.5.2           htmltools_0.5.3       
## [100] corpcor_1.6.10         later_1.3.0            tzdb_0.3.0            
## [103] lubridate_1.8.0        DBI_1.1.3              dbplyr_2.2.1          
## [106] ComplexHeatmap_2.10.0  tmvtnorm_1.5           Matrix_1.5-1          
## [109] cli_3.4.1              vsn_3.62.0             imputeLCMD_2.1        
## [112] parallel_4.1.3         igraph_1.3.5           pkgconfig_2.0.3       
## [115] MALDIquant_1.21        xml2_1.3.3             foreach_1.5.2         
## [118] rARPACK_0.11-0         bslib_0.4.0            XVector_0.34.0        
## [121] rvest_1.0.3            digest_0.6.30          Biostrings_2.62.0     
## [124] rmarkdown_2.17         cellranger_1.1.0       fastmatch_1.1-3       
## [127] shiny_1.7.3            gtools_3.9.3           rjson_0.2.21          
## [130] lifecycle_1.0.3        jsonlite_1.8.3         limma_3.50.3          
## [133] fansi_1.0.3            pillar_1.8.1           KEGGREST_1.34.0       
## [136] fastmap_1.1.0          httr_1.4.4             plotrix_3.8-2         
## [139] glue_1.6.2             png_0.1-7              iterators_1.0.14      
## [142] bit_4.0.4              stringi_1.7.8          sass_0.4.2            
## [145] blob_1.2.3             caTools_1.18.2         memoise_2.0.1
knitr::knit_exit()